rm(list=ls())



library(lmtest)      # version 0.9-40
library(tidyverse)   # version 2.0.0
library(sandwich)    # version 3.1-1
library(estimatr)    # version 1.0.6
library(ggplot2)     # version 3.5.2
library(quanteda)    # version 4.3.0
library(dplyr)       # version 1.1.4
library(stopwords)   # version 2.3
library(knitr)       # version 1.50
library(kableExtra)  # version 1.4.0
library(stringr)     # version 1.5.1
library(seededlda)   # version 1.4.2
library(gtranslate)  # version 0.0.1
library(irr)         # version 0.84.1
library(gt)          # version 1.0.0
library(here)       # version1.0.1

here()                
load("erdoganspeechesraw.RData") ### Main dataset (scraping script is available upon request)

load("ldafinal.RData") ##  output of the topic models

load("cosinesimilaritydataforfigure4.RData")

load("gloverobustnesscheckcosine.RData")

coder_author <- read_csv("author.csv") ##Data for validation (author)


coder_independent <- read_csv("independentcoder.csv") ##Data for validation 2 (independent coder)





options(scipen = 999) ##to avoid scientific notation




### now to be able to get the heteroskedasticity-consistent standard errors and
### print them together with the models in a table, I create a custom function
### this way I produce every single  table in the paper
get_robust <- function(model, model_name = "Model") {
  ct <- coeftest(model, vcov = vcovHC(model))
  stars <- symnum(
    ct[, 4],
    cutpoints = c(0, 0.001, 0.01, 0.05, 1),
    symbols = c("***", "**", "*", "")
  )
  tibble(
    term = rownames(ct),
    !!model_name := paste0(sprintf("%.3f", ct[, 1]), stars, "\n(", sprintf("%.3f", ct[, 2]), ")")
  )
}



# models: list of fitted lm() objects
# order: vector of term order
# labels: named vector for pretty labels
# fit_rows: tribble with extra rows (Obs, R2, etc.)

make_table <- function(models, order, labels, fit_rows = NULL, model_names = NULL) {
  n_models <- length(models)
  if (is.null(model_names)) {
    model_names <- paste0("Model_", seq_len(n_models))
  }
  
  # Create a list of tibbles, each with term and its own column for coef_sig(se)
  robust_list <- Map(get_robust, models, model_names)
  
  # Merge all by 'term'
  robust_tab <- Reduce(function(x, y) full_join(x, y, by = "term"), robust_list)
  
  # Add pretty labels
  robust_tab$label <- labels[robust_tab$term]
  
  # Arrange in order, remove any terms not in order
  robust_tab <- robust_tab %>% filter(term %in% order) %>% slice(match(order, term))
  
  # Select label and all model columns in order
  result <- robust_tab %>% select(label, all_of(model_names))
  
  # Bind fit_rows if given
  if (!is.null(fit_rows)) {
    result <- bind_rows(result, fit_rows)
  }
  result
}



##




### below is the dictionary. ideology = populism + religious_nationalism 

### This also equals to tablea2
#translation of this to english is tablea3
finaldictionary <- dictionary(list(
  religion_nationalism = c("ecda","şehit*","han","gazi*","bölüc*","bayra*","beka*","vatan",  "vatan-millet", "vatana", "vatanda", "vatandaki", "vatandan","vatani*", "vatanı*","vatanla*","vatanp*","vatans*", "feth*", "fetih*","zafer*","şanlıdır", "şanlı","şanla", "dinim*","minar*","iman*","ezan*","ümmet*","nesil*","dindar*","cihat*","şehadet*","değerlerim*"),
  populism = c("hain*", "ihanet*", "irade*", "statüko*","milletim*","lobi*","elit*","halkın*","rant*","kumpas*","şef","şefin","şefine","şefliğini","şeflik","ezile*"),
  ideology = c("ecda","şehit*","han","gazi*","bölüc*","bayra*","beka*","vatan",  "vatan-millet", "vatana", "vatanda", "vatandaki", "vatandan","vatani*", "vatanı*","vatanla*","vatanp*","vatans*", "feth*", "fetih*","zafer*","şanlıdır", "şanlı","şanla", "dinim*","minar*","iman*","ezan*","ümmet*","nesil*","dindar*","cihat*","şehadet*","değerlerim*","hain*", "ihanet*", "irade*", "statüko*","milletim*","lobi*","elit*","halkın*","rant*","kumpas*","şef","şefin","şefine","şefliğini","şeflik","ezile*"),
  proceduralfinal =  c("otoriter*","dikta*" ,"otokra*", "baskı*", "yolsuz*" , "ayrım*" , "çoğul*" ,  "eşit*" , "adil*" , "özgürl*" , "barış*"  ,"arabulucu*",  "müzakere*" , "referandum*" ,  " şeffaf*" ,  "ihlal*","gönüllü*",  "oyl*", "anti-demokratik",  "anayasa*" , "yargı*" ,"yerel*", "adalet*",  "hoşgör*" ,"adaletl*", "adli*" , "hakla*", "azınlık*" , "sandık*","meşru*" ,  "hukuk*" ,"kurum*" ,  "meclis*" ,  "vesaye*", "demok*", "seçilmiş"),
  performancefinal =  c( "savunma","büyüme*", "i̇ş*" ,"eğit*" ,"kalkın*" , "yatır*" , "yüzde*" , "refah*", "köpr*",  "sanay*","enflas*", "faiz*" ,"sağlık*"  ,"işsiz*", "istikrar*" , "teknoloji*" , "kriz*" ,"baraj*", "tesis*", "fabrika*","iktisa*", "konut*", "inşaat*", "ücret*" ,"dolar*", "borç*" ,"maaş", "IMF", "araba*" , "buzdola*", "kuyru*" , "hastan*", "havalimanı", "yol", "sermaye*",   "banka*","endüstri", "proje*" ,"otomobil*", "tarım*","tasarruf*" ,"reform")))


## Preprocesssing
#####manually step by step

#checking the date data
head(erdoganspeeches$Date)

##create a dummy for Coup
erdoganspeeches$Coup <- ifelse(erdoganspeeches$Date >= as.Date("2016-07-15"),1,0)


##Preparing the stopwords to remove
## Build-in stop words from R environment
turkishstopwords <- as.data.frame(stopwords::stopwords("tr", source = "stopwords-iso"))
### does not seem enough I add some manually
additionalstopwords <- c( "bugün" ,"'" ,'acaba', "kıymetli", 'değerli', 'sevgili', "saygıdeğer" ,'ama' , 'ancak', 'arada', 'artık', 'asla', 'aslında', 'ayrıca', 'az', 'bana', 'bazen', 'bazı', 'bazıları', 'belki', 'ben', 'benden', 'beni', 'benim', 'beri',  'bile', 'bilhassa', 'bin', 'bir', 'biraz', 'birçoğu', 'birçok', 'biri', 'birisi', 'birkaç', 'birşey', 'biz', 'bizden', 'bize', 'bizi', 'bizim', 'böyle', 'böylece','bu', 'buna', 'bunda','bundan',  'bunu', 'bunun', 'burada', 'bütün', 'çoğu', 'çoğunu', 'çok', 'çünkü', 'da', 'daha', 'dahi', 'dan', 'de', 'defa', 'değil', 'diğer', 'diğeri', 'diğerleri', 'diye', 'doksan', 'dokuz', 'dolayı', 'dolayısıyla',  'e', 'edecek', 'eden', 'ederek', 'edilecek', 'ediliyor', 'edilmesi', 'ediyor', 'eğer', 'elbette', 'en', 'etmesi', 'etti', 'ettiği', 'ettiğini', 'fakat', 'falan', 'filan', 'gene', 'gereği', 'gerek', 'gibi', 'göre', 'hala', 'halde', 'halen', 'hangi', 'hangisi', 'hani', 'hatta', 'hem', 'henüz', 'hep', 'hepsi', 'her', 'herhangi', 'herkes', 'herkese', 'herkesi', 'herkesin', 'hiç', 'hiçbir', 'hiçbiri',  'i', 'ı', ' içinde', 'iki', 'ile', 'ilgili', 'ise', 'işte', 'itibaren', 'itibariyle', 'kaç', 'kadar', 'karşın', 'kendi', 'kendilerine', 'kendine', 'kendini', 'kendisi', 'kendisine', 'kendisini', 'kez', 'ki',  'kimi', 'kime', 'kimin', 'kimisi', 'kimse', 'madem', 'mi', 'mı', 'mu', 'mü', 'nasıl', 'ne', 'neden', 'nedenle', 'nerde', 'nerede', 'nereye', 'neyse', 'niçin', 'nin', 'nın', 'niye', 'nun', 'nün', 'o', 'öbür', 'olan', 'olarak', 'oldu', 'olduğu', 'olduğunu', 'olduklarını', 'olmadı', 'olmadığı', 'olmak', 'olması', 'olmayan', 'olmaz', 'olsa', 'olsun', 'olup', 'olur', 'olursa', 'oluyor', 'on', 'ön', 'ona', 'önce', 'ondan', 'onlar', 'onlara', 'onlardan', 'onları', 'onların', 'onu', 'onun', 'orada', 'öte', 'ötürü', 'öyle', 'oysa', 'pek', 'rağmen', 'sana', 'sanki', 'şayet', 'şekilde', 'sen', 'senden', 'seni' ,'senin', 'şey', 'şeyden', 'şeye', 'şeyi', 'şeyler', 'şimdi', 'siz' , 'siz', 'sizden', 'sizden', 'size', 'sizi', 'sizin', 'sonra', 'şöyle', 'şu', 'şuna', 'şunları', 'şunu', 'ta', 'tabi', 'tabii', 'tam', 'tamam', 'tamamen', 'tarafından', 'tüm', 'tümü', 'u', 'ü', 'r/n', 'r','üç', 'un', 'ün', 'üzere', 'var', 'vardı', 've', 'veya', 'ya', 'yani', 'yapacak', 'yapılan', 'yapılması', 'yapıyor', 'yapmak', 'yaptı', 'yaptığı', 'yaptığını', 'yaptıkları','yedi', 'yi', 'yine', 'yoksa', 'yu', 'zaten', 'zira', 'a', "/r", "'" ,"/", "/n", "n")
additionalstopwords  <- as.data.frame(additionalstopwords)


colnames(turkishstopwords) <- c("words")

colnames(additionalstopwords) <- c("words")
#### This is the stop word list
turkishstopwords_all <- bind_rows(turkishstopwords, additionalstopwords)


##I manually lemmatize the conjugations of "türk" into to the root "türk"
turkeywords <- c("türkiye'n", "türki", "türkiye", "türkiyen")
turkeywordsreplace <- rep("türk", length(turkeywords))

## I follow tge standart preprocessing steps

##make a corpus
corpusversion <- corpus(erdoganspeeches, text_field = "text")

#remove punctuations
tokenizedversion <- tokens(corpusversion,  remove_punct = TRUE)

#remove stop words
tokenizedversion <- tokens_remove(tokenizedversion, turkishstopwords_all$words )

#removing  persistent punctuations
tokenizedversion <- tokens_replace(
  tokenizedversion, 
  types(tokenizedversion), 
  stringi::stri_replace_all_regex(types(tokenizedversion), "[']", ""))



tokenizedversion <- tokens_replace(
  tokenizedversion, 
  types(tokenizedversion), 
  stringi::stri_replace_all_regex(types(tokenizedversion), "[;]", ""))


#numbers,separators, symbols removed. Also I made everything lowercase
tokenizedversion <- tokens(tokenizedversion,remove_numbers = TRUE)
tokenizedversion <- tokens(tokenizedversion,remove_separators = TRUE)
tokenizedversion <- tokens(tokenizedversion,remove_symbols = TRUE)
tokenizedversion <- tokens_tolower(tokenizedversion)

tokenizedversion 
#stemming for the topic models. In dictionary-based models no stemming was applied
stemmedversion1 <- tokens_wordstem(tokenizedversion,language = "turkish")

stemmedversion1

as.character(tokenizedversion[2])
## run the dictionaries
dictionarycounted_dfm <- dfm_lookup(dfm(tokenizedversion), dictionary =    finaldictionary )


dictionarycounted <- convert(  dictionarycounted_dfm, to = "data.frame")



#merge the dictioanry with dataset
erdoganspeeches <- inner_join( erdoganspeeches,dictionarycounted, by = "doc_id")

#transform date to numeric (This makes it easier for the regressions)

erdoganspeeches$nday <- as.numeric( erdoganspeeches$Date)

####renaming the dummy for visualizations later
erdoganspeeches <- erdoganspeeches %>%
  mutate(Period = ifelse(Coup == 0, "Before the Coup", "After the Coup"))


##subsetting the data for the test of hypothesis 
erdogansubsetmain <- subset(erdoganspeeches,erdoganspeeches$Date >= as.Date("2016-01-20") & erdoganspeeches$Date <= as.Date("2017-01-12") )



### Figure 1
fig1 <- erdoganspeeches %>% 
  gather(variable, value, ideology, proceduralfinal, performancefinal) %>% 
  mutate(
    Strategy = case_when(
      variable == "ideology" ~ "Ideological Legitimation",
      variable == "performancefinal" ~ "Performance Legitimation",
      variable == "proceduralfinal" ~ "Procedural Legitimation"
    ),
    rolling_mean = rollmean(value, 7, fill = NA)
  ) %>% 
  ggplot(aes(x = Date, y = sqrt(rolling_mean), color = Strategy)) +
  geom_line() +
  labs(
    y = "Frequency of Legitimation Claim over time", 
    x = "Date"  # removes "Date" from x-axis
  ) +
  theme_minimal(base_size = 8) +
  facet_wrap(~Strategy, scales = "free_y", ncol = 3) +  # strip.position removed
  theme(
    strip.text = element_blank(),          # Hides facet labels
    axis.text.x = element_text(size = 6),  # Keeps year labels
    axis.title.x = element_blank(),        # Hides x-axis title ("Date")
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 7)
  ) +
  geom_vline(xintercept = as.numeric(as.Date("2016-07-15")), 
             linetype = "dashed", color = "red") +
  annotate("text", x = as.Date("2016-07-15"), y = Inf, 
           label = "The Coup Attempt", vjust = 1, hjust = 0, 
           color = "red", size = 2) +
  scale_color_manual(
    values = c(
      "Ideological Legitimation" = "purple",
      "Performance Legitimation" = "darkblue",
      "Procedural Legitimation" = "darkgreen"
    )
  )
fig1


ggsave("figure1.pdf", width = 6, height = 5)

## Below is main models

model1 <- lm(sqrt(ideology) ~ Coup + I(nday - 16997) + Coup*I(nday - 16997)  +  I((nday - 16997)^2) + Coup*I((nday - 16997)^2) + Place + log(length)   , data = erdogansubsetmain )


summary(model1)

coeftest(model1 , vcov = vcovHC(model1))



model2 <- lm(sqrt(proceduralfinal) ~ Coup + I(nday - 16997) + Coup*I(nday - 16997)  +  I((nday - 16997)^2) + Coup*I((nday - 16997)^2) + Place + log(length)   , data = erdogansubsetmain )


summary(model2)

coeftest(model2 , vcov = vcovHC(model2))





model3 <- lm(sqrt(performancefinal) ~ Coup + I(nday - 16997) + Coup*I(nday - 16997)  +  I((nday - 16997)^2) + Coup*I((nday - 16997)^2) + Place + log(length)   , data = erdogansubsetmain )


summary(model3)

coeftest(model3 , vcov = vcovHC(model3))

##now lets reproduce  table 2


#Firstly, we define variable display order and pretty labels
order <- c(
  "Coup",
  "I(nday - 16997)", 
  "I((nday - 16997)^2)", 
  "Coup:I(nday - 16997)", 
  "Coup:I((nday - 16997)^2)",
  "Place1", "Place2", "Place3", "Place4", "Place5", "Place6",
  "log(length)", "(Intercept)"
)

labels <- c(
  "Coup" = "After the Coup Attempt",
  "I(nday - 16997)" = "Time Distance to the Coup Attempt (Secular Trend)",
  "I((nday - 16997)^2)" = "Time Distance to the Coup Attempt² (Secular Trend)",
  "Coup:I(nday - 16997)" = "Time Since the Coup Attempt (Gradual Effect)",
  "Coup:I((nday - 16997)^2)" = "Time Since the Coup Attempt² (Gradual Effect)",
  "Place1" = "Major Audience Civil Society",
  "Place2" = "Educational Institution Event",
  "Place3" = "Major Audience International Actors",
  "Place4" = "Major Audience Political Elite",
  "Place5" = "Developmental Project Event",
  "Place6" = "Religious or Memorial Event",
  "log(length)" = "Length of the Speech (logged)",
  "(Intercept)" = "Constant"
)


table2_list <- list(model1,model2, model3)
table2_names <- c("Col1", "Col2", "Col3")


table2fit <- tribble(
  ~label, ~Col1, ~Col2, ~Col3,
  "Observations", "109", "109", "109",
  "R²", "0.66", "0.59", "0.44", 
  "Adjusted R²", "0.62", "0.54", "0.37"
)


# Run function
table2 <- make_table(
  models = table2_list,
  order = order,
  labels = labels,
  fit_rows = table2fit ,
  model_names = table2_names
)


table2

kable(table2, format = "latex", booktabs = TRUE, align = "lccc", escape = FALSE)
### Figure 2 below


#Figure 2 
fig2 <- erdogansubsetmain %>%
  select(ideology, Period,Date,Place) %>%
  mutate(Coup = as.factor(ifelse(Date >= as.Date("2016-07-15"), 1, 0))) %>%
  ggplot(aes(x = Date, y = sqrt(ideology), color = Period)) +
  geom_point()  +
  geom_smooth(method = "lm_robust", 
              formula = y ~ poly(x, 2), se = TRUE) + 
  scale_color_manual(values = c("Before the Coup" = "black", "After the Coup" = "darkred")) +
  geom_vline(xintercept = as.Date("2016-07-15"), color = "darkblue",
             size = 1, linetype = "dashed") +
  labs(y = "Degree of Ideological Legitimation (Square-root transformed)",
       x = "Date")  + theme_minimal()

fig2
ggsave(fig2, filename = "figure2.pdf", width = 6, height = 5)

### Now I will do the other-dictionary based robustness checks

## all these are reported in the supplementary material



##1 Models with full data
tablea3col1 <- lm(sqrt(ideology) ~ Coup + I(nday - 16997) + Coup*I(nday - 16997)  +  I((nday - 16997)^2) + Coup*I((nday - 16997)^2) + Place + log(length)   , data = erdoganspeeches )


summary(tablea3col1)

coeftest(tablea3col1 , vcov = vcovHC(tablea3col1))



tablea3col2 <- lm(sqrt(proceduralfinal) ~ Coup + I(nday - 16997) + Coup*I(nday - 16997)  +  I((nday - 16997)^2) + Coup*I((nday - 16997)^2) + Place + log(length)   , data = erdoganspeeches )


summary(tablea3col2)

coeftest(tablea3col2 , vcov = vcovHC(tablea3col2))





tablea3col3 <- lm(sqrt(performancefinal) ~ Coup + I(nday - 16997) + Coup*I(nday - 16997)  +  I((nday - 16997)^2) + Coup*I((nday - 16997)^2) + Place + log(length)   , data = erdoganspeeches)


summary(tablea3col3)

coeftest(tablea3col3 , vcov = vcovHC(tablea3col3))




tablea3_list <- list(tablea3col1,tablea3col2, tablea3col3)
tablea3_names <- c("Col1", "Col2", "Col3")


tablea3fit <- tribble(
  ~label, ~Col1, ~Col2, ~Col3,
  "Observations", "1181", "1181", "1181",
  "R²", "0.56", "0.49", "0.54", 
  "Adjusted R²", "0.55", "0.48", "0.54"
)


# Models with full data
tablea3 <- make_table(
  models = tablea3_list,
  order = order,
  labels = labels,
  fit_rows = tablea3fit ,
  model_names = tablea3_names
)

tablea3


kable(tablea3, format = "latex", booktabs = TRUE, align = "lccc", escape = FALSE)




equaltime  <- subset(erdoganspeeches,erdoganspeeches$Date >= as.Date("2014-08-28") & erdoganspeeches$Date <= as.Date("2018-05-20") )



tablea4col1 <- lm(sqrt(ideology) ~ Coup + I(nday - 16997) + Coup*I(nday - 16997)  +  I((nday - 16997)^2) + Coup*I((nday - 16997)^2) + Place + log(length)   , data = equaltime )


summary(tablea4col1)

coeftest(tablea4col1 , vcov = vcovHC(tablea4col1))



tablea4col2 <- lm(sqrt(proceduralfinal) ~ Coup + I(nday - 16997) + Coup*I(nday - 16997)  +  I((nday - 16997)^2) + Coup*I((nday - 16997)^2) + Place + log(length)   , data = equaltime )


summary(tablea4col2)

coeftest(tablea4col2 , vcov = vcovHC(tablea4col2))





tablea4col3 <- lm(sqrt(performancefinal) ~ Coup + I(nday - 16997) + Coup*I(nday - 16997)  +  I((nday - 16997)^2) + Coup*I((nday - 16997)^2) + Place + log(length)   , data = equaltime)


summary(tablea4col3)

coeftest(tablea4col3 , vcov = vcovHC(tablea4col3))


tablea4_list <- list(tablea4col1,tablea4col2, tablea4col3)
tablea4_names <- c("Col1", "Col2", "Col3")


tablea4fit <- tribble(
  ~label, ~Col1, ~Col2, ~Col3,
  "Observations", "397", "397", "397",
  "R²", "0.62", "0.55", "0.53", 
  "Adjusted R²", "0.61", "0.53", "0.51"
)


# Run function
tablea4 <- make_table(
  models = tablea4_list,
  order = order,
  labels = labels,
  fit_rows = tablea4fit ,
  model_names = tablea4_names
)

tablea4 


kable(tablea4, format = "latex", booktabs = TRUE, align = "lccc", escape = FALSE)


equalobservations <- subset(erdoganspeeches,erdoganspeeches$Date >= as.Date("2014-08-28") & erdoganspeeches$Date <= as.Date("2017-05-02") )


tablea5col1 <- lm(sqrt(ideology) ~ Coup + I(nday - 16997) + Coup*I(nday - 16997)  +  I((nday - 16997)^2) + Coup*I((nday - 16997)^2) + Place + log(length)   , data = equalobservations)


summary(tablea5col1)

coeftest(tablea5col1 , vcov = vcovHC(tablea5col1))



tablea5col2 <- lm(sqrt(proceduralfinal) ~ Coup + I(nday - 16997) + Coup*I(nday - 16997)  +  I((nday - 16997)^2) + Coup*I((nday - 16997)^2) + Place + log(length)   , data = equalobservations )


summary(tablea5col2)

coeftest(tablea5col2 , vcov = vcovHC(tablea5col2))





tablea5col3 <- lm(sqrt(performancefinal) ~ Coup + I(nday - 16997) + Coup*I(nday - 16997)  +  I((nday - 16997)^2) + Coup*I((nday - 16997)^2) + Place + log(length)   , data = equalobservations)


summary(tablea5col3)

coeftest(tablea5col3 , vcov = vcovHC(tablea5col3))


tablea5_list <- list(tablea5col1,tablea5col2, tablea5col3)
tablea5_names <- c("Col1", "Col2", "Col3")


tablea5fit <- tribble(
  ~label, ~Col1, ~Col2, ~Col3,
  "Observations", "318", "318", "318",
  "R²", "0.62", "0.60", "0.56", 
  "Adjusted R²", "0.61", "0.58", "0.54"
)


# Run function
tablea5 <- make_table(
  models = tablea5_list,
  order = order,
  labels = labels,
  fit_rows = tablea5fit ,
  model_names = tablea5_names
)

tablea5

kable(tablea5, format = "latex", booktabs = TRUE, align = "lccc", escape = FALSE)


erdogansubsetmain$ideologydividedbylength <- erdogansubsetmain$ideology/erdogansubsetmain$length


erdogansubsetmain$performancedividedbylength <- erdogansubsetmain$performancefinal/erdogansubsetmain$length

erdogansubsetmain$proceduraldividedbylength <- erdogansubsetmain$proceduralfinal/erdogansubsetmain$length



tablea6col1 <- lm(sqrt(ideologydividedbylength) ~ Coup + I(nday - 16997) + Coup*I(nday - 16997)  +  I((nday - 16997)^2) + Coup*I((nday - 16997)^2) + Place  , data = erdogansubsetmain)


summary(tablea6col1)

coeftest(tablea6col1 , vcov = vcovHC(tablea6col1))



tablea6col2 <- lm(sqrt(proceduraldividedbylength) ~ Coup + I(nday - 16997) + Coup*I(nday - 16997)  +  I((nday - 16997)^2) + Coup*I((nday - 16997)^2) + Place   , data = erdogansubsetmain )


summary(tablea6col2)

coeftest(tablea6col2 , vcov = vcovHC(tablea6col2))





tablea6col3 <- lm(sqrt(performancedividedbylength) ~ Coup + I(nday - 16997) + Coup*I(nday - 16997)  +  I((nday - 16997)^2) + Coup*I((nday - 16997)^2) + Place    , data = erdogansubsetmain)


summary(tablea6col3)

coeftest(tablea6col3 , vcov = vcovHC(tablea6col3))


tablea6_list <- list(tablea6col1,tablea6col2, tablea6col3)
tablea6_names <- c("Col1", "Col2", "Col3")


tablea6fit <- tribble(
  ~label, ~Col1, ~Col2, ~Col3,
  "Observations", "109", "109", "109",
  "R²", "0.50", "0.42", "0.43", 
  "Adjusted R²", "0.44", "0.38", "0.35"
)


# Run function
tablea6 <- make_table(
  models = tablea6_list,
  order = order,
  labels = labels,
  fit_rows = tablea6fit ,
  model_names = tablea6_names
)

tablea6

kable(tablea6, format = "latex", booktabs = TRUE, align = "lccc", escape = FALSE)


tablea7col1 <- lm(sqrt(populism) ~ Coup + I(nday - 16997) + Coup*I(nday - 16997)  +  I((nday - 16997)^2) + Coup*I((nday - 16997)^2) + Place + log(length)   , data = erdogansubsetmain)


summary(tablea7col1)

coeftest(tablea7col1 , vcov = vcovHC(tablea7col1))



tablea7col2 <- lm(sqrt(populism) ~ Coup + I(nday - 16997) + Coup*I(nday - 16997)  +  I((nday - 16997)^2) + Coup*I((nday - 16997)^2) + Place + log(length)   , data = erdoganspeeches)


summary(tablea7col2)

coeftest(tablea7col2 , vcov = vcovHC(tablea7col2))




tablea7col3 <- lm(sqrt(religion_nationalism) ~ Coup + I(nday - 16997) + Coup*I(nday - 16997)  +  I((nday - 16997)^2) + Coup*I((nday - 16997)^2) + Place + log(length)   , data = erdogansubsetmain)


summary(tablea7col3)

coeftest(tablea7col3 , vcov = vcovHC(tablea7col3))



tablea7col4 <- lm(sqrt(religion_nationalism) ~ Coup + I(nday - 16997) + Coup*I(nday - 16997)  +  I((nday - 16997)^2) + Coup*I((nday - 16997)^2) + Place + log(length)   , data = erdoganspeeches)


summary(tablea7col4)

coeftest(tablea7col4 , vcov = vcovHC(tablea7col4))


tablea7_list <- list(tablea7col1,tablea7col2, tablea7col3, tablea7col4)
tablea7_names <- c("Populism1", "Populism2", "Religious_Nationalism1","Religious_Nationalism2")


tablea7fit <- tribble(
  ~label, ~Populism1, ~Populism2, ~Religious_Nationalism1, ~Religious_Nationalism2,
  "Observations", "109", "1181", "109", "1181",
  "R²", "0.63", "0.54", "0.58", "0.45",
  "Adjusted R²", "0.59", "0.51", "0.52", "0.44"
)

# Run function
tablea7 <- make_table(
  models = tablea7_list,
  order = order,
  labels = labels,
  fit_rows = tablea7fit ,
  model_names = tablea7_names
)

tablea7

kable(tablea7, format = "latex", booktabs = TRUE, align = "lccc", escape = FALSE)


### Now moving on the parts that relate to the topic models
### I have trained an lda model which you can upload below
##If you uncommand the following line you may also train the model yourself
## It is important note here that the output 
###might differ slightly in terms of order of topics and order of some of the words due to randomness.
### However, the substantial results based on the downstream task do not change.
#ldamodel <- textmodel_lda(dfm(stemmedversion1), k = 11)

terms(ldamodel,n = 20) ### the translated version of this table is table 3




## now  putting the predictions of the topic model to a dataframe
topicsdf <- data.frame(doc_id = rownames(ldamodel$theta),ldamodel$theta, row.names = NULL)


head(topicsdf)

##labeling the texts in a meaningful way
colnames(topicsdf) <- c(
  "doc_id",
  "Education-Culture",
  "Religion",
  "Sports",
  "International Relations",
  "Religious Nationalism",
  "Projects",
  "Pandemic and Economy",
  "Mega Projects",
  "Security",
  "Paternalism and Justice",
  "Unidentifiable"
)


head(topicsdf)

# Creating the top3 topic later for validation
topic_cols <- names(topicsdf)[-1]

topicsdf <- topicsdf %>%
  rowwise() %>%
  mutate(
    top3topic = paste(
      topic_cols[order(c_across(all_of(topic_cols)), decreasing = TRUE)[1:3]],
      collapse = ", "
    ),
    toptopic = topic_cols[which.max(c_across(all_of(topic_cols)))]
  ) %>%
  ungroup()



## joining the topic data to our dataframe

erdoganspeeches <- inner_join( erdoganspeeches,topicsdf , by = "doc_id") 



## to make it easier to run models I 
erdoganspeeches$ideologytopic <- erdoganspeeches$`Religious Nationalism`

##### TABLE 4 correlation tests
cor.test(erdoganspeeches$ideology, erdoganspeeches$ideologytopic, method= "pearson")
cor.test(erdoganspeeches$performancefinal, erdoganspeeches$ideologytopic, method= "pearson")

cor.test(erdoganspeeches$proceduralfinal, erdoganspeeches$ideologytopic, method= "pearson")


##now I will create the table
cor_df <- data.frame(
  `Religious Nationalism \\\\ (topic probability)` = "",
  `Ideological Legitimation` = paste0("$", sprintf("%.2f", {
    r <- cor.test(erdoganspeeches$ideology, erdoganspeeches$ideologytopic)
    r$estimate
  }), ifelse({
    p <- cor.test(erdoganspeeches$ideology, erdoganspeeches$ideologytopic)$p.value
    p < 0.001
  }, "^{***}", "")),
  
  `Procedural Legitimation` = paste0("$", sprintf("%.2f", {
    r <- cor.test(erdoganspeeches$proceduralfinal, erdoganspeeches$ideologytopic)
    r$estimate
  }), ifelse({
    p <- cor.test(erdoganspeeches$proceduralfinal, erdoganspeeches$ideologytopic)$p.value
    p < 0.001
  }, "^{***}", "")),
  
  `Performance Legitimation` = paste0("$", sprintf("%.2f", {
    r <- cor.test(erdoganspeeches$performancefinal, erdoganspeeches$ideologytopic)
    r$estimate
  }), ifelse({
    p <- cor.test(erdoganspeeches$performancefinal, erdoganspeeches$ideologytopic)$p.value
    p < 0.001
  }, "^{***}", ""))
)

rownames(cor_df) <- "Religious Nationalism \\\\ (topic probability)"

cor_df

## I manually generated the Latex table but below is  the automatically generated version

kbl(
  cor_df,
  escape = FALSE,
  booktabs = TRUE,
  align = "lccc",
  caption = paste0(
    "Pearson’s correlation with significance test between three legitimation dictionaries ",
    "and religious nationalism topic. \\\\ \\textit{Note:} *p<0.05; **p<0.01; ***p<0.001"
  )
) %>%
  kable_styling(latex_options = "hold_position")


#### I manually adjusted the table above
###also bare in mind that instead of rounding to -0.01 I put it as -0.008 in the paper


##Now I will subset the data to 6 months before and after versions again
erdogansubsetmain <- subset(erdoganspeeches,erdoganspeeches$Date >= as.Date("2016-01-20") & erdoganspeeches$Date <= as.Date("2017-01-12") )

###table a8
tablea8col1 <- lm(sqrt(ideologytopic) ~ Coup + I(nday - 16997) + Coup*I(nday - 16997)  +  I((nday - 16997)^2) + Coup*I((nday - 16997)^2) + Place  + log(length)   , data = erdogansubsetmain )


summary(tablea8col1)

coeftest(tablea8col1 , vcov = vcovHC(tablea8col1))


tablea8col2 <-lm(sqrt(ideologytopic) ~ Coup + I(nday - 16997) + Coup*I(nday - 16997)  +  I((nday - 16997)^2) + Coup*I((nday - 16997)^2) + Place  + log(length)   , data = erdoganspeeches ) 



summary(tablea8col2)


coeftest(tablea8col2 , vcov = vcovHC(tablea8col2))

##now creating tablea8, which is the table that present the results of models that use
#topic models as DV
tablea8_list <- list(tablea8col1,tablea8col2)
tablea8_names <- c("subset","full")

# Step 5: Add R² and N manually, no standard errors for these rows
tablea8fit <- tribble(
  ~label, ~subset, ~full,
  "Observations", "109", "1181",
  "R²", "0.54", "0.33",
  "Adjusted R²", "0.49", "0.32"
)
# Run function
tablea8 <- make_table(
  models = tablea8_list,
  order = order,
  labels = labels,
  fit_rows = tablea8fit ,
  model_names = tablea8_names
)

tablea8

kable(tablea8, format = "latex", booktabs = TRUE, align = "lccc", escape = FALSE)






## Figure 3
 fig3 <- erdogansubsetmain %>%
  select(ideologytopic, Period,Date,Place) %>%
  mutate(Coup = as.factor(ifelse(Date >= as.Date("2016-07-15"), 1, 0))) %>%
  ggplot(aes(x = Date, y = sqrt(ideologytopic), color = Period)) +
  geom_point()  +
  geom_smooth(method = "lm_robust", 
              formula = y ~ poly(x, 2), se = TRUE) + 
  scale_color_manual(values = c("Before the Coup" = "black", "After the Coup" = "darkred")) +
  geom_vline(xintercept = as.Date("2016-07-15"), color = "darkblue",
             size = 1, linetype = "dashed") +
  labs(y = "Topic Probability for Religious-Nationalism",
       x = "Date")  + theme_minimal()
fig3
ggsave(fig3, filename = "figure3.pdf", width = 6, height = 5)

###### Now moving  to the validation part




fig4 <- ggplot(mean_new, aes(x = as.factor(grouptrue), y = mean, group = interaction(grouptrue, Period), color = Period)) +
  # Decrease the width to the size of each facet label
  geom_point(size = 3, alpha = 0.5, position = position_dodge(width = 0.7)) +
  geom_linerange(aes(ymin = lower, ymax = upper),
                 position = position_dodge(width = 0.7),
                 size = 0.7,
                 show.legend = FALSE) +  # hide legend for geom_linerange
  facet_wrap(~grouptrue, scales = "free_x", ncol = 4) +  # Decrease height and offset overlapping parts
  labs(x = "Issue", y = "Semantic Association with Religious Nationalism Terms") +
  scale_color_manual(values = c("Before the Coup" = "black", "After the Coup" = "darkred")) +  # Set line colors to black and dark red
  theme_minimal() +  # Remove gray background
  theme(strip.placement = "outside",  # Place strip text outside the plot area
        strip.background = element_blank(),  # Remove strip background
        strip.text.x = element_blank())  # Remove strip text from x-axis

fig4

ggsave(fig4, filename = "figure4.pdf", width = 6, height = 5)


##adding the notes column to author data
coder_author$Notes <- NA

##adding the coder information
coder_author$coder <- "author"

coder_independent$coder <- "independent"

### independent coder used turkish labels
#I translate them in the following
translation_map <- c(
  "Din" = "Religion",
  "Dini Milliyet\xe7ilik" = "Religious Nationalism",
  "E?itim K\xfclt\xfcr" = "Education-Culture",
  "E?itim-K\xfclt\xfcr" = "Education-Culture",
  "G\xfcvenlik" = "Security",
  "Mega Projeler" = "Mega Projects",
  "Pandemi ve Ekonomi" = "Pandemic and Economy",
  "Paternalizm ve Adalet" = "Paternalism and Justice",
  "Projeler" = "Projects",
  "Uluslararas? ?li?kiler" = "International Relations",
  "Unidentifiable (Tan?mlanamayan)" = "Unidentifiable",
  "Uluslaras? ?li?kiler" = "International Relations"
)



coder_independent <- coder_independent %>%
  mutate(`Most Prominent Theme` = recode(`Most Prominent Theme`, !!!translation_map))



###combine the datasets
twocoderscombined <- rbind(coder_author,coder_independent)

###transform data to wide format
coders_wide <- twocoderscombined  %>%
  select(Link, `Most Prominent Theme`, coder) %>%
  pivot_wider(names_from = coder, values_from = `Most Prominent Theme`) %>% select(-Link) %>% as.matrix()


##make sure no NAs or anything
# this should lead to no change anyway
coders_wide_clean <- apply(coders_wide, 2, function(x) trimws(as.character(x)))

coders_wide_clean <- as.matrix(coders_wide)
storage.mode(coders_wide_clean) <- "character"

dim(coders_wide_clean)    # Should be 100 2
head(coders_wide_clean)


rownames(coders_wide_clean) <- NULL




# 1. Find all unique themes
all_labels <- unique(as.vector(coders_wide_clean))

# 2. Map to numbers (apply to each column, but with shared levels)
coders_numeric <- apply(coders_wide_clean, 2, function(x) as.numeric(factor(x, levels = all_labels)))

##calculate kripps alpha

kripp.alpha(t(coders_numeric), method = "nominal")

##0.698 rounded to 0.70 and reported in the appendix in text



###now merging the data for goldstandardvalidation
goldstandardsubset  <- erdoganspeeches  %>%
  inner_join(twocoderscombined, by = c("rte_link" = "Link"))  

##Now calculating the F1 scores
##firstly based on top3 topic

f1_top3 <-  goldstandardsubset  %>%
  mutate(
    is_present = str_detect(top3topic, `Most Prominent Theme`) # Check if Most Prominent Theme is in top3topic
  ) %>%
  group_by(coder, `Most Prominent Theme`) %>%
  summarize(
    TP = sum(is_present, na.rm = TRUE),  # True Positives: theme matches top3topic
    FP = sum(!is_present, na.rm = TRUE), # False Positives: theme does not match top3topic
    FN = sum(is.na(is_present) | !is_present, na.rm = TRUE), # False Negatives: missing or not present
    Precision = if_else(TP + FP > 0, TP / (TP + FP), 0),  # Handle division by zero
    Recall = if_else(TP + FN > 0, TP / (TP + FN), 0),     # Handle division by zero
    F1_Score = if_else((Precision + Recall) > 0, 
                       2 * (Precision * Recall) / (Precision + Recall), 
                       0) # Handle division by zero
  ) %>%
  ungroup()

print(f1_top3, n= 22)

## Now, conservative measure (based on  whether the top topics matches)
f1_con <-  goldstandardsubset  %>%
  mutate(
    is_present = str_detect(toptopic, `Most Prominent Theme`) # Check if Most Prominent Theme is in top3topic
  ) %>%
  group_by(coder, `Most Prominent Theme`) %>%
  summarize(
    TP = sum(is_present, na.rm = TRUE),  # True Positives: theme matches top3topic
    FP = sum(!is_present, na.rm = TRUE), # False Positives: theme does not match top3topic
    FN = sum(is.na(is_present) | !is_present, na.rm = TRUE), # False Negatives: missing or not present
    Precision = if_else(TP + FP > 0, TP / (TP + FP), 0),  # Handle division by zero
    Recall = if_else(TP + FN > 0, TP / (TP + FN), 0),     # Handle division by zero
    F1_Score = if_else((Precision + Recall) > 0, 
                       2 * (Precision * Recall) / (Precision + Recall), 
                       0) # Handle division by zero
  ) %>%
  ungroup()



print(f1_con, n= 22)

### I adjust the data to prepare the table
# If your coder column is "author"/"student", relabel to match table header if desired:
f1_con$Coder <- ifelse(f1_con$coder == "author", "Coder 1 (Author)", "Coder 2 (MA Student)")
f1_top3$Coder <- ifelse(f1_top3$coder == "author", "Coder 1 (Author)", "Coder 2 (MA Student)")


# Wide format for both con and Top 3 columns
tab_con <- f1_con %>%
  select(Coder, `Most Prominent Theme`, F1_Score) %>%
  pivot_wider(names_from = Coder, values_from = F1_Score, names_prefix = "con_")

tab_top3 <- f1_top3 %>%
  select(Coder, `Most Prominent Theme`, F1_Score) %>%
  pivot_wider(names_from = Coder, values_from = F1_Score, names_prefix = "Top3_")


tab_combined <- tab_con %>%
  full_join(tab_top3, by = "Most Prominent Theme") 

tab_combined
# Clean up names for your table output
tab_combined <- tab_combined %>% 
  rename(
    `Topic Label` = `Most Prominent Theme`,
    `con (Coder 1)` = `con_Coder 1 (Author)`,
    `con (Coder 2)` = `con_Coder 2 (MA Student)`,
    `Top 3 (Coder 1)` = `Top3_Coder 1 (Author)`,
    `Top 3 (Coder 2)` = `Top3_Coder 2 (MA Student)`
  )

# Ordering the columns
desired_order <- c("Education-Culture", "International Relations", "Mega Projects", "Pandemic and Economy",
                   "Paternalism and Justice", "Projects", "Religion", "Religious Nationalism", "Security",
                   "Sports", "Unidentifiable")
tab_combined <- tab_combined %>%
  mutate(`Topic Label` = factor(`Topic Label`, levels = desired_order)) %>%
  arrange(`Topic Label`)

##adding the average row
micro_avg <- tab_combined %>%
  summarise(across(where(is.numeric), mean, na.rm = TRUE))
micro_avg$`Topic Label` <- "Micro-Average"


micro_avg$`Most Prominent Theme` <- NA

tab_combined <- bind_rows(tab_combined, micro_avg)


##Swapping the positions of the columns

tab_combined <- tab_combined %>%
  select(
    `Topic Label`,
    `con (Coder 1)`,
    `Top 3 (Coder 1)`,
    `con (Coder 2)`,
    `Top 3 (Coder 2)`,
    `Most Prominent Theme`
  )
#### Tablea9

tab_combined

tablea9 <- tab_combined
tablea9
### Pretty version
tablea9 %>%
  gt() %>%
  tab_header(title = "Scores by Topic and Coders")

## then I manually adjusted the  table in .tex editor

##now from here we subset back the data to create table a10
# and check the 

#author
coder1 <- subset(goldstandardsubset,goldstandardsubset$coder == "author" )
#independent coder
coder2 <- subset(goldstandardsubset,goldstandardsubset$coder == "independent" )

## Identifying  the documents  where the coders considered Religious Nationalism as the most dominant topic
coder1$RN <-  ifelse(coder1$`Most Prominent Theme` == "Religious Nationalism",1,0)

coder2$RN <-  ifelse(coder2$`Most Prominent Theme` == "Religious Nationalism",1,0)

#correlation tests for author
cor.test(coder1$RN, coder1$ideology,method = "pearson")


cor.test(coder1$RN, coder1$proceduralfinal,method = "pearson")


cor.test(coder1$RN, coder1$performancefinal,method = "pearson")

#correlation tests for independent coder


cor.test(coder2$RN, coder2$ideology)


cor.test(coder2$RN, coder2$proceduralfinal)


cor.test(coder2$RN, coder2$performancefinal)

## Now to create  tablea10  I create a new custom function 


get_corr_star <- function(x, y) {
  ct <- cor.test(x, y)
  est <- ct$estimate
  p <- ct$p.value
  stars <- ifelse(p < 0.001, "***",
                  ifelse(p < 0.01, "**",
                         ifelse(p < 0.05, "*", "")))
  # Format to two decimal places and add stars
  sprintf("%.2f%s", est, stars)
}


# First row: Coder 1
row1 <- c(
  get_corr_star(coder1$RN, as.numeric(coder1$ideology)),
  get_corr_star(coder1$RN, as.numeric(coder1$proceduralfinal)),
  get_corr_star(coder1$RN, as.numeric(coder1$performancefinal))
)
# Second row: Coder 2
row2 <- c(
  get_corr_star(coder2$RN, coder2$ideology),
  get_corr_star(coder2$RN, coder2$proceduralfinal),
  get_corr_star(coder2$RN, coder2$performancefinal)
)

row1

row2
## below is tablea10

tablea10 <- data.frame(
  check.names = FALSE,
  row.names = c("Dominant Theme = Religious Nationalism \\\\ (Coder 1)", 
                "Dominant Theme = Religious Nationalism \\\\ (Coder 2)"),
  "Ideological Legitimation" = c(row1[1], row2[1]),
  "Procedural Legitimation" = c(row1[2], row2[2]),
  "Performance Legitimation" = c(row1[3], row2[3])
)

tablea10

kbl(
  tablea10,
  escape = FALSE,
  booktabs = TRUE,
  align = "lccc",
  caption = paste0(
    "Pearson’s correlation with significance test between a document being coded as religious nationalist by the coder and the frequency of three legitimation dictionaries. \\\\ \\textit{Note:} *p<0.05; **p<0.01; ***p<0.001"
  )
) %>%
  kable_styling(latex_options = "hold_position")



fig4 <- ggplot(mean_new, aes(x = as.factor(grouptrue), y = mean, group = interaction(grouptrue, Period), color = Period)) +
  # Decrease the width to the size of each facet label
  geom_point(size = 3, alpha = 0.5, position = position_dodge(width = 0.7)) +
  geom_linerange(aes(ymin = lower, ymax = upper),
                 position = position_dodge(width = 0.7),
                 size = 0.7,
                 show.legend = FALSE) +  # hide legend for geom_linerange
  facet_wrap(~grouptrue, scales = "free_x", ncol = 4) +  # Decrease height and offset overlapping parts
  labs(x = "Issue", y = "Semantic Association with Religious Nationalism Terms") +
  scale_color_manual(values = c("Before the Coup" = "black", "After the Coup" = "darkred")) +  # Set line colors to black and dark red
  theme_minimal() +  # Remove gray background
  theme(strip.placement = "outside",  # Place strip text outside the plot area
        strip.background = element_blank(),  # Remove strip background
        strip.text.x = element_blank())  # Remove strip text from x-axis

fig4


ggsave(fig4, filename = "figure4.pdf", width = 6, height = 5)


colors <- c("Before the coup" = "#ff7f0e", "After the coup" = "#1f77b4")

figa.1 <- ggplot(robustmodelglove , aes(x = as.factor(Perioddate), y = mean, ymin = lower, ymax = upper,
                              group = Period, color = Period)) +
  geom_point(position = position_dodge(width = 0.5), size = 2) +
  geom_errorbar(position = position_dodge(width = 0.5), width = 0.2) +
  scale_color_manual(values = colors) +
  labs(x = "Period", y = "Semantic Association with Religious Nationalism Terms") +
  facet_wrap(~ group, scales = "free_y", ncol = 1) +
  theme_minimal()

figa.1

ggsave(figa.1, filename = "figurea.1.pdf", width = 8, height = 8)


